Here we simulate from the true model, with the following parameters.
For each simulated data, we fit a ZINB model with all the possible combinations (16) of the following parameters:
See bias, variance, and MSE in separate Rmd files (simZeisel_estimates_25.Rmd, simZeisel_estimates_5.Rmd, and simZeisel_estimates_75.Rmd).
load("./datasets/simZeisel_var2_z25_dist.rda")
corr25 <- lapply(1:12, function(i) rowMeans(sapply(res, function(x) x[[i]])))
corr25 = do.call(cbind, corr25)
corr25 = data.frame(corr25, stringsAsFactors = F)
colnames(corr25) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
corr25$pzero = .25
boxplot(corr25[,1:12], main="Correlation between true and estimated sample distances in W\npzero = 0.25", border=c(3, 1, 2, rep(1,10)), xlab="", ylab="Correlation", las=2)
abline(h=1,lty = 2)
load("./datasets/simZeisel_var2_z45_dist.rda")
corr45 <- lapply(1:12, function(i) rowMeans(sapply(res, function(x) x[[i]])))
corr45 = do.call(cbind, corr45)
corr45 = data.frame(corr45, stringsAsFactors = F)
colnames(corr45) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
corr45$pzero = .45
boxplot(corr45[,1:12], main="Correlation between true and estimated sample distances in W\npzero = 0.45", border=c(3, 1, 2, rep(1,10)), xlab="method", ylab="Correlation", las=2)
abline(h=1,lty = 2)
load("./datasets/simZeisel_var2_z75_dist.rda")
corr75 <- lapply(1:12, function(i) rowMeans(sapply(res, function(x) x[[i]])))
corr75 = do.call(cbind, corr75)
corr75 = data.frame(corr75, stringsAsFactors = F)
colnames(corr75) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
corr75$pzero = .75
boxplot(corr75[,1:12], main="Correlation between true and estimated sample distances in W\npzero = 0.75", border=c(3, 1, 2, rep(1,10)), xlab="method", ylab="Correlation", las=2)
abline(h=1,lty = 2)
corr = rbind(corr25, corr45, corr75)
corr = melt(corr, id.vars = 'pzero')
ggplot(corr, aes(x = factor(pzero), y= value, col = factor(pzero))) +
geom_boxplot(outlier.shape = NA) + facet_grid(~ variable) +
ggtitle('Correlation between true and estimated sample distances in W') +
ylab('Correlation') + xlab('Method')
corr = lapply(1:2, function(x){
lapply(c(25,45,75), function(y){
nn = sprintf("./datasets/simZeisel_var%s_z%s_dist.rda", x, y)
load(nn)
cc <- lapply(1:12, function(i) rowMeans(sapply(res, function(x) x[[i]])))
do.call(cbind, cc)
})
})
corr = do.call(rbind, do.call(rbind, corr))
corr = data.frame(corr, stringsAsFactors = F)
colnames(corr) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
corr$pzero = rep(rep(c(.25,.45,.75), each = 100), 2)
corr$var = rep(paste('Clustering', 1:2), each = 300)
corrMolten = melt(corr, id.vars = c('pzero', 'var'))
corrSum = corrMolten %>% group_by(pzero, var, variable) %>%
summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
as.data.frame()
corrSum$pzero = factor(corrSum$pzero)
c1 = corrSum[corrSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
geom_point() + geom_line() + labs(col='') +
theme_bw() + xlab('Zero Fraction') + facet_grid( ~ var) +
ylab('Correlation between true and estimated sample distances in W')
#geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .1)
load("./datasets/simZeisel_var2_z25_dist.rda")
sil25 <- lapply(13:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
sil25 = do.call(cbind, sil25)
sil25 = data.frame(sil25,stringsAsFactors = F)
colnames(sil25) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
sil25$pzero = .25
boxplot(sil25[,1:12], main="Silhouette width of true labels\npzero = 0.25",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width", las=2)
boxplot(sil25[,1:12] - sil25[,1],
main="Difference of Silhouette width of true labels\npzero = 0.25",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width - True Silhouette width", las=2)
abline(h = 0, lty = 2)
load("./datasets/simZeisel_var2_z45_dist.rda")
sil45 <- lapply(13:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
sil45 = do.call(cbind, sil45)
sil45 = data.frame(sil45,stringsAsFactors = F)
colnames(sil45) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
sil45$pzero = .45
boxplot(sil45[,1:12], main="Silhouette width of true labels\npzero = 0.45",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width", las=2)
boxplot(sil45[,1:12] - sil45[,1],
main="Difference of Silhouette width of true labels\npzero = 0.45",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width - True Silhouette width", las=2)
abline(h = 0, lty = 2)
load("./datasets/simZeisel_var2_z75_dist.rda")
sil75 <- lapply(13:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
sil75 = do.call(cbind, sil75)
sil75 = data.frame(sil75,stringsAsFactors = F)
colnames(sil75) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
sil75$pzero = .75
boxplot(sil75[,1:12], main="Silhouette width of true labels\npzero = 0.75",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width", las=2)
boxplot(sil75[,1:12] - sil75[,1],
main="Difference of Silhouette width of true labels\npzero = 0.75",
border=c(3, 1, 2, rep(1,10)), xlab="",
ylab="Silhouette width - True Silhouette width", las=2)
abline(h = 0, lty = 2)
sil = rbind(sil25, sil45, sil75)
sil = melt(sil, id.vars = 'pzero')
ggplot(sil, aes(x = factor(pzero), y= value, col = factor(pzero))) +
geom_boxplot(outlier.shape = NA) + facet_grid(~ variable) +
ggtitle('Silhouette width of true labels') +
ylab('Silhouette width') + xlab('Zero Fraction')
sil = lapply(1:2, function(x){
lapply(c(25,45,75), function(y){
nn = sprintf("./datasets/simZeisel_var%s_z%s_dist.rda", x, y)
load(nn)
cc <- lapply(13:length(res[[1]]), function(i){
rowMeans(sapply(res, function(x) x[[i]]))
})
do.call(cbind, cc)
})
})
sil = do.call(rbind, do.call(rbind, sil))
sil = data.frame(sil, stringsAsFactors = F)
colnames(sil) <- c("true W", paste0("zinb k=", 1:4),
"PCA (k=2)", "PCA TC (k=2)", "PCA TMM (k=2)",
"PCA FQ (k=2)",
'ZIFA (k=2)', 'ZIFA TMM (k=2)', 'ZIFA FQ (k=2)')
sil = sil[,1:12] - sil[,1]
sil$pzero = rep(rep(c(.25,.45,.75), each = 100), 2)
sil$var = rep(paste('Clustering', 1:2), each = 300)
silMolten = melt(sil, id.vars = c('pzero', 'var'))
silSum = silMolten %>% group_by(pzero, var, variable) %>%
summarize(mean = mean(value), sd = sd(value)) %>% ungroup() %>%
as.data.frame()
silSum$pzero = factor(silSum$pzero)
c1 = silSum[silSum$variable != 'true W',]
c1$method = sapply(strsplit(as.vector(c1$variable), ' '), '[[', 1)
ggplot(c1, aes(x = pzero, y = mean, col = variable, group = variable)) +
geom_point() + geom_line() + labs(col='') +
theme_bw() + xlab('Zero Fraction') + facet_grid( ~ var) +
geom_hline(yintercept = 0, col = 'gray') +
ylab('silelation between true and estimated sample distances in W')
#geom_errorbar(aes(ymin = mean - sd, ymax = mean + sd), width = .1)
For one simulated dataset with the right parameters (Vintercept, genewise dispersion, K= 2).
load("./datasets/simZeisel_var2_z25.rda")
load("./datasets/simZeisel_var2_z25_zifaFQ.rda")
load("./datasets/simZeisel_var2_z25_fitted.rda")
biotrue = factor(bio)
fit = fittedSim[[2]][[1]][[2]][[1]]
fq <- betweenLaneNormalization(t(simData[[1]]$counts), which="full")
pca <- prcomp(t(log1p(fq)))
par(mfrow=c(3,4))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.25',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.25',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA FQ\nfZero = 0.25')
plot(zifaFQ[[1]], col = biotrue, main = 'ZIFA FQ\nfZero = 0.25')
load("./datasets/simZeisel_var2_z45.rda")
load("./datasets/simZeisel_var2_z45_zifaFQ.rda")
load("./datasets/simZeisel_var2_z45_fitted.rda")
biotrue = factor(bio)
fit = fittedSim[[2]][[1]][[2]][[1]]
fq <- betweenLaneNormalization(t(simData[[1]]$counts), which="full")
pca <- prcomp(t(log1p(fq)))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.45',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.45',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA FQ\nfZero = 0.45')
plot(zifaFQ[[1]], col = biotrue, main = 'ZIFA FQ\nfZero = 0.45')
load("./datasets/simZeisel_var2_z75.rda")
load("./datasets/simZeisel_var2_z75_zifaFQ.rda")
load("./datasets/simZeisel_var2_z75_fitted.rda")
biotrue = factor(bio)
fit = fittedSim[[2]][[1]][[2]][[1]]
fq <- betweenLaneNormalization(t(simData[[1]]$counts), which="full")
pca <- prcomp(t(log1p(fq)))
plot(simModel@W, col = biotrue, main = 'True W\nfZero = 0.75',
xlab = 'W1', ylab = 'W2')
plot(fit@W, col = biotrue, main = 'estimated ZINB W\nfZero = 0.75',
xlab = 'W1', ylab = 'W2')
plot(pca$x, col = biotrue, main = 'PCA FQ\nfZero = 0.75')
plot(zifaFQ[[1]], col = biotrue, main = 'ZIFA FQ\nfZero = 0.75')
Let’s look at model fit when chosen parameters are correct (V, genewise dispersion, K=2).
From Sandrine’s slide.
Mean, variance, and zero probability for the ZINB model are \[E[Y_{i,j}] = (1 - \pi_{i,j})\mu_{i,j},\] \[var(Y_{i,j}) = (1 - \pi_{i,j})\mu_{i,j}(1 + \mu_{i,j}(\phi_j + \pi_{i,j})),\] \[P(Y_{i,j} = 0) = \pi_{i,j} + (1 - \pi_{i,j})(1 + \phi_j \mu_{i,j})^{\frac{1}{\phi_j}}.\]
For the NB model, \(\pi_{i,j}=0\). We fitted the NB using edgeR after full quantile normalization.
computeExp <- function(zinbModel){
(1 - t(getPi(zinbModel))) * t(getMu(zinbModel))
}
computeVar <- function(zinbModel){
mu = t(getMu(zinbModel))
pi = t(getPi(zinbModel))
phi = exp(-getZeta(zinbModel))
(1 - pi) * mu * (1 + mu*(phi + pi))
}
computeP0 <- function(zinbModel){
mu = t(getMu(zinbModel))
pi = t(getPi(zinbModel))
phi = exp(-getZeta(zinbModel))
pi + (1 - pi) * (1 + phi * mu) ^ (-1/phi)
}
plotMD <- function(x, y, xlim = c(0,10), ylim = c(-5, 5),
main = 'ZINB: MD-plot estimated vs. observed mean count, log scale'){
mm = .5*(x + y)
dd = x - y
smoothScatter(mm, dd, xlim = xlim, ylim = ylim, xlab = 'Mean', ylab = 'Diff', main = main)
abline(h = 0, col = 'gray')
fit = loess(dd ~ mm)
xpred = seq(0, 10, .1)
pred = predict(fit, xpred)
lines(xpred, pred, col = 'red', type = 'l', lwd=2)
}
fitNB <- function(rda, i = 1){
load(rda)
counts = t(simData[[i]]$counts)
phenoData = data.frame(bio)
set = newSeqExpressionSet(counts, phenoData = phenoData)
fq = betweenLaneNormalization(set, which = "full", offset = T)
disp = estimateDisp(counts(fq), offset = -offst(fq))
fit = glmFit(counts(fq), dispersion = disp$tagwise.dispersion, offset = -offst(fq))
list(fitted = fit$fitted.values, disp = disp$tagwise.dispersion)
}
There is one outlier for zinb fit.
# zinb
load('./datasets/simZeisel_var2_z45_fitted.rda')
load('./datasets/simZeisel_var2_z45.rda')
zz = fittedSim[[2]][[1]][[2]][[1]]
zinbEY = rowMeans(log1p(computeExp(zz)))
zinbPY0 = rowMeans(computeP0(zz))
# observed
counts = t(simData[[1]]$counts)
logAveCount = rowMeans(log1p(counts))
prop0 = rowMeans(counts == 0)
# edgeR
nb = fitNB('./datasets/simZeisel_var2_z45.rda')
## Design matrix not provided. Switch to the classic mode.
nbEY = rowMeans(log1p(nb$fitted))
nbPY0 = rowMeans((1 + nb$fitted * nb$disp)^(-1/nb$disp))
MD-plot estimated vs. observed mean count, log scale, zoomed (we do not see the outlier)
par(mfrow=c(1,2))
plotMD(logAveCount, zinbEY, xlim = c(0, 4), ylim = c(-2, 1),
main = 'ZINB')
plotMD(logAveCount, nbEY, xlim = c(0, 4), ylim = c(-2, 1),
main = 'NB, edgeR')
MD-plot estimated vs. observed zero probability
par(mfrow=c(1,2))
plotMD(prop0, zinbPY0, xlim = c(0, 1), ylim = c(-1, 1),
main = 'ZINB')
plotMD(prop0, nbPY0, xlim = c(0, 1), ylim = c(-1, 1),
main = 'NB, edgeR')
par(mfrow=c(1,1))
smoothScatter(logAveCount, rowMeans(counts == 0),
xlab = 'log average count', xlim = c(0,4),
ylab = 'proportion of zeros', ylim = c(0,1),
main = 'Zero probability versus Mean')
fit = loess(zinbPY0 ~ zinbEY)
xpred = seq(0,4,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='green',type = 'l', lwd=2, ylim = c(0,1), xlim = c(0,4))
fit = loess(nbPY0 ~ nbEY)
xpred = seq(0,4,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col='red', type='l', lwd=2, ylim = c(0,1), xlim = c(0,4))
fit = loess(rowMeans(counts == 0) ~ logAveCount)
xpred = seq(0,4,.1)
pred = predict(fit, xpred)
lines(xpred, pred,col = 'blue',type = 'l',lwd=2, ylim = c(0,1), xlim = c(0,4))
legend('topright', c('observed', 'zinb', 'nb, edgeR'),
lty=1, col=c('blue', 'green', 'red'), bty='n', cex=.45)
par(mfrow = c(1, 2))
smoothScatter(exp(-simModel@zeta),exp(-zz@zeta),
xlab = 'True Dispersion', ylab = 'Estimated Dispersion',
ylim = c(0, 10),xlim = c(0,10),
main = 'ZINB')
abline(a = 0, b = 1, col = 'gray')
smoothScatter(exp(-simModel@zeta), nb$disp,
xlab = 'True Dispersion', ylab = 'Estimated Dispersion',
ylim = c(0, 10),xlim = c(0,10),
main = 'NB, edgeR')
abline(a = 0, b = 1, col = 'gray')
par(mfrow = c(1, 1))
xpred = seq(0, 1, .05)
fitzinb = loess(exp(-zz@zeta) ~ rowMeans(counts == 0))
predzinb = predict(fitzinb, xpred)
fitnb = loess(nb$disp ~ rowMeans(counts == 0))
prednb = predict(fitnb, xpred)
smoothScatter(rowMeans(counts == 0), exp(-simModel@zeta),
xlab = 'Observed zero probability',ylim = c(0, 15),
ylab = 'True dispersion',xlim = c(0,1),
main = 'True Dispersion versus observed zero probability')
lines(xpred, predzinb, col = 'green', type = 'l', lwd=2)
lines(xpred, prednb, col = 'red', type = 'l', lwd=2)
legend('topright',c('zinb','nb, edgeR'),lty=1,col=c('green','red'), bty='n', cex=.45)